# !pip install joypy
# !pip install eli5
#!pip install bubbly
import warnings
warnings.filterwarnings('ignore')
# for some basic operations
import numpy as np
import pandas as pd
import joypy
# for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import plotting
from pandas.plotting import parallel_coordinates
# for interactive visualizations
import plotly
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected = True)
import plotly.figure_factory as ff
# for animated visualizations
from bubbly.bubbly import bubbleplot
# for providing path
import os
# for modelling
import sklearn
import imblearn
# for model explanation
import shap
# Load the insurance-claims dataset from the working directory.
data = pd.read_csv('insurance_claims.csv')
# let's take a look at the first five rows of the data
data.head()
| months_as_customer | age | policy_number | policy_bind_date | policy_state | policy_csl | policy_deductable | policy_annual_premium | umbrella_limit | insured_zip | ... | police_report_available | total_claim_amount | injury_claim | property_claim | vehicle_claim | auto_make | auto_model | auto_year | fraud_reported | _c39 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 328 | 48 | 521585 | 2014-10-17 | OH | 250/500 | 1000 | 1406.91 | 0 | 466132 | ... | YES | 71610 | 6510 | 13020 | 52080 | Saab | 92x | 2004 | Y | NaN |
| 1 | 228 | 42 | 342868 | 2006-06-27 | IN | 250/500 | 2000 | 1197.22 | 5000000 | 468176 | ... | ? | 5070 | 780 | 780 | 3510 | Mercedes | E400 | 2007 | Y | NaN |
| 2 | 134 | 29 | 687698 | 2000-09-06 | OH | 100/300 | 2000 | 1413.14 | 5000000 | 430632 | ... | NO | 34650 | 7700 | 3850 | 23100 | Dodge | RAM | 2007 | N | NaN |
| 3 | 256 | 41 | 227811 | 1990-05-25 | IL | 250/500 | 2000 | 1415.74 | 6000000 | 608117 | ... | NO | 63400 | 6340 | 6340 | 50720 | Chevrolet | Tahoe | 2014 | Y | NaN |
| 4 | 228 | 44 | 367455 | 2014-06-06 | IL | 500/1000 | 1000 | 1583.91 | 6000000 | 610706 | ... | NO | 6500 | 1300 | 650 | 4550 | Accura | RSX | 2009 | N | NaN |
5 rows × 40 columns
# let's take a look at a random sample of five rows of the data
data.sample(5)
| months_as_customer | age | policy_number | policy_bind_date | policy_state | policy_csl | policy_deductable | policy_annual_premium | umbrella_limit | insured_zip | ... | police_report_available | total_claim_amount | injury_claim | property_claim | vehicle_claim | auto_make | auto_model | auto_year | fraud_reported | _c39 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 343 | 334 | 47 | 156694 | 2001-05-24 | IL | 500/1000 | 500 | 1238.89 | 0 | 600561 | ... | NO | 6240 | 960 | 960 | 4320 | Ford | Fusion | 2011 | N | NaN |
| 688 | 290 | 47 | 885789 | 2008-07-21 | IN | 250/500 | 1000 | 1393.34 | 0 | 472922 | ... | YES | 56160 | 6240 | 12480 | 37440 | Audi | A5 | 2002 | N | NaN |
| 465 | 33 | 33 | 758740 | 1997-08-04 | IL | 500/1000 | 1000 | 1096.79 | 6000000 | 446898 | ... | ? | 81400 | 8140 | 8140 | 65120 | BMW | M5 | 1998 | N | NaN |
| 23 | 413 | 55 | 115399 | 1991-02-08 | IN | 100/300 | 2000 | 1268.79 | 0 | 453148 | ... | ? | 98160 | 8180 | 16360 | 73620 | Dodge | RAM | 2011 | Y | NaN |
| 888 | 381 | 55 | 963761 | 1991-04-13 | OH | 500/1000 | 500 | 1459.99 | 0 | 445856 | ... | YES | 60600 | 12120 | 6060 | 42420 | Accura | TL | 2011 | N | NaN |
5 rows × 40 columns
# Check for null values in the data.
# The dataset marks missing entries with '?', so normalise those to NaN first.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)
data.isnull().any()
months_as_customer False age False policy_number False policy_bind_date False policy_state False policy_csl False policy_deductable False policy_annual_premium False umbrella_limit False insured_zip False insured_sex False insured_education_level False insured_occupation False insured_hobbies False insured_relationship False capital-gains False capital-loss False incident_date False incident_type False collision_type True incident_severity False authorities_contacted False incident_state False incident_city False incident_location False incident_hour_of_the_day False number_of_vehicles_involved False property_damage True bodily_injuries False witnesses False police_report_available True total_claim_amount False injury_claim False property_claim False vehicle_claim False auto_make False auto_model False auto_year False fraud_reported False _c39 True dtype: bool
# Missing-value treatment.
# collision_type: impute with the most frequent category since the true type is unknown.
# Plain assignment (instead of fillna(..., inplace=True) on a column slice) avoids the
# chained-assignment pattern that stops working under pandas Copy-on-Write.
data['collision_type'] = data['collision_type'].fillna(data['collision_type'].mode()[0])
# property_damage: no response most plausibly means no damage was reported.
data['property_damage'] = data['property_damage'].fillna('NO')
# police_report_available: likewise, no response is taken as no report available.
data['police_report_available'] = data['police_report_available'].fillna('NO')
# still True at this point because the all-null _c39 column has not been dropped yet
data.isnull().any().any()
True
# Drop the completely empty trailing column, then visualise the class balance
# of the fraud label as a donut chart.
data.drop(columns=['_c39'], inplace=True)
fraud_counts = data['fraud_reported'].value_counts()
pie_trace = go.Pie(labels=fraud_counts.index,
                   values=fraud_counts.values,
                   marker=dict(colors=['silver', 'gold']),
                   name='Frauds',
                   hole=0.3)
fig = go.Figure(data=[pie_trace],
                layout=go.Layout(title='Distribution of Frauds'))
py.iplot(fig)
# Ridgeline (joy) plot: distributions of incident hour, number of vehicles
# involved and witness count, one ridge row per incident city.
fig, axes = joypy.joyplot(data,
column = ['incident_hour_of_the_day','number_of_vehicles_involved', 'witnesses'],
by = 'incident_city',
ylim = 'own',
figsize = (20, 10),
alpha = 0.5,
legend = True)
plt.title('Incident hour, No. of vehicles, witnesses vs Incident City', fontsize = 20)
plt.show()
# Strip plot of property-damage response vs. property claim amount.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)
# keyword arguments: seaborn >= 0.12 no longer accepts positional x/y vectors
sns.stripplot(x = data['property_damage'], y = data['property_claim'], palette = 'bone')
# title fixed — the original 'Incident Type vs Vehicle Claim' was a copy-paste
# leftover from the next plot
plt.title('Property Damage vs Property Claim', fontsize = 20)
plt.show()
# Boxen (letter-value) plot of vehicle claim amount by incident type.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)
# keyword arguments: seaborn >= 0.12 no longer accepts positional x/y vectors
sns.boxenplot(x = data['incident_type'], y = data['vehicle_claim'], palette = 'pink')
plt.title('Incident Type vs Vehicle Claim', fontsize = 20)
plt.show()
# Share of each incident type within each incident city (row-normalised crosstab).
incident = pd.crosstab(data['incident_city'], data['incident_type'])
colors = plt.cm.Blues(np.linspace(0, 1, 5))
incident.div(incident.sum(1).astype(float), axis = 0).plot(kind = 'bar',
                                                           stacked = False,
                                                           figsize = (15, 7),
                                                           color = colors)
# title fixed — this chart compares incident city with incident type, not
# incident type with collision type
plt.title('Incident City vs Incident Type', fontsize = 20)
plt.legend()
plt.show()
# Share of each severity level within each incident type (row-normalised crosstab).
incident = pd.crosstab(data['incident_type'], data['incident_severity'])
colors = plt.cm.summer(np.linspace(0, 1, 5))
incident.div(incident.sum(1).astype(float), axis = 0).plot(kind = 'bar',
                                                           stacked = False,
                                                           figsize = (15, 7),
                                                           color = colors)
# title fixed — this chart compares incident type with incident severity
plt.title('Incident Type vs Incident Severity', fontsize = 20)
plt.legend()
plt.show()
# Stacked bar chart: distribution of collision types within each incident type.
collision_share = pd.crosstab(data['incident_type'], data['collision_type'])
row_totals = collision_share.sum(axis=1).astype(float)
normalised = collision_share.div(row_totals, axis=0)
normalised.plot(kind='bar',
                stacked=True,
                figsize=(15, 7),
                color=plt.cm.inferno(np.linspace(0, 1, 5)))
plt.title('Incident Type vs Collision Type', fontsize = 20)
plt.legend()
plt.show()
# let's check the insured occupations (original comment wrongly said hobbies)
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)
# keyword argument: seaborn >= 0.12 no longer accepts a positional x vector
sns.countplot(x = data['insured_occupation'], palette = 'PuRd')
plt.title('Different Types of Occupation of Insured Customers', fontsize = 20)
plt.xticks(rotation = 90)
plt.show()
# let's check the insured hobbies
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)
# keyword argument: seaborn >= 0.12 no longer accepts a positional x vector
sns.countplot(x = data['insured_hobbies'], palette = 'cool')
plt.title('Different Types of Hobbies of Insured Customers', fontsize = 20)
plt.xticks(rotation = 90)
plt.show()
# let's check the incident types
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)
# keyword argument: seaborn >= 0.12 no longer accepts a positional x vector
sns.countplot(x = data['incident_type'], palette = 'spring')
plt.title('Different Types of Incidents', fontsize = 20)
plt.show()
# Swarm plot of total claim amount by policy state.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)
# keyword arguments: seaborn >= 0.12 no longer accepts positional x/y vectors
sns.swarmplot(x = data['policy_state'], y = data['total_claim_amount'], palette = 'copper')
plt.title('Policy State vs Total Claim Amount', fontsize = 20)
plt.show()
# Parallel-coordinates plot of the claim-amount columns, coloured by fraud label.
plt.figure(figsize=(20, 10), dpi= 80)
parallel_coordinates(data[['total_claim_amount','injury_claim', 'property_claim','vehicle_claim','fraud_reported']],
                     'fraud_reported', colormap = 'copper')
# Lighten borders
plt.gca().spines["top"].set_alpha(0)
plt.gca().spines["bottom"].set_alpha(.3)
plt.gca().spines["right"].set_alpha(0)
plt.gca().spines["left"].set_alpha(.3)
# The axes title was the placeholder 'DC'; the real title is set via suptitle
# below, so the axes title is left empty.
plt.title('')
plt.grid(alpha=0.3)
plt.suptitle('total claim, Injury claim, Property claim, vehicle claim vs Fraud Reported', fontsize = 20)
plt.show()
# let's take another look at the data (now without the _c39 column)
data.head()
| months_as_customer | age | policy_number | policy_bind_date | policy_state | policy_csl | policy_deductable | policy_annual_premium | umbrella_limit | insured_zip | ... | witnesses | police_report_available | total_claim_amount | injury_claim | property_claim | vehicle_claim | auto_make | auto_model | auto_year | fraud_reported | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 328 | 48 | 521585 | 2014-10-17 | OH | 250/500 | 1000 | 1406.91 | 0 | 466132 | ... | 2 | YES | 71610 | 6510 | 13020 | 52080 | Saab | 92x | 2004 | Y |
| 1 | 228 | 42 | 342868 | 2006-06-27 | IN | 250/500 | 2000 | 1197.22 | 5000000 | 468176 | ... | 0 | NO | 5070 | 780 | 780 | 3510 | Mercedes | E400 | 2007 | Y |
| 2 | 134 | 29 | 687698 | 2000-09-06 | OH | 100/300 | 2000 | 1413.14 | 5000000 | 430632 | ... | 3 | NO | 34650 | 7700 | 3850 | 23100 | Dodge | RAM | 2007 | N |
| 3 | 256 | 41 | 227811 | 1990-05-25 | IL | 250/500 | 2000 | 1415.74 | 6000000 | 608117 | ... | 2 | NO | 63400 | 6340 | 6340 | 50720 | Chevrolet | Tahoe | 2014 | Y |
| 4 | 228 | 44 | 367455 | 2014-06-06 | IL | 500/1000 | 1000 | 1583.91 | 6000000 | 610706 | ... | 1 | NO | 6500 | 1300 | 650 | 4550 | Accura | RSX | 2009 | N |
5 rows × 39 columns
# Sort by auto_year so the animated bubble plot's time axis is ordered.
data = data.sort_values(by='auto_year')
import warnings
warnings.filterwarnings('ignore')
# Animated bubble plot (bubbly): annual premium vs total claim amount, bubble
# size = months_as_customer, animated over auto_year, coloured by insured sex.
figure = bubbleplot(dataset = data, x_column = 'policy_annual_premium', y_column = 'total_claim_amount',
bubble_column = 'insured_sex', time_column = 'auto_year', size_column = 'months_as_customer', color_column = 'insured_sex',
x_title = "Annual Policy Premium", y_title = "Total Claim Amount", title = 'Annual Premium vs Total Claim Amount vs Months as Customer',
x_logscale = False, scale_bubble = 3, height = 650)
py.iplot(figure, config={'scrollzoom': True})
# Histogram of customer education levels.
trace = go.Histogram(
    x = data['insured_education_level'],
    # trace name fixed — 'Marvel' was a leftover from an unrelated notebook
    name = 'Education Level',
    opacity = 0.75,
    marker = dict(
        # rgba() is the correct CSS form for a colour with an alpha channel;
        # rgb() only accepts three components
        color = 'rgba(195, 195, 145, 0.5)'
    )
)
df = [trace]
layout = go.Layout(
    title = 'Education Level of the Customers')
fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
# Histogram of customer occupations.
trace = go.Histogram(
    x = data['insured_occupation'],
    # trace name fixed — 'Marvel' was a leftover from an unrelated notebook
    name = 'Occupation',
    opacity = 0.75,
    marker = dict(
        # rgba() is required for a four-component (alpha) colour
        color = 'rgba(15, 255, 185, 0.5)'
    )
)
df = [trace]
layout = go.Layout(
    title = 'Occupation of the Customers')
fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
# Donut charts for the gender and relationship distributions of the customers.
sex_counts = data['insured_sex'].value_counts()
rel_counts = data['insured_relationship'].value_counts()
gender_trace = go.Pie(labels=sex_counts.index,
                      values=sex_counts.values,
                      marker=dict(colors=['aqua', 'gold']),
                      name='Gender',
                      hole=0.3)
rel_trace = go.Pie(labels=rel_counts.index,
                   values=rel_counts.values,
                   marker=dict(colors=['pink', 'lightblue', 'lightgreen', 'grey', 'red']),
                   name='Relationship',
                   hole=0.3)
fig = go.Figure(data=[gender_trace],
                layout=go.Layout(title='Gender of the Customers'))
fig2 = go.Figure(data=[rel_trace],
                 layout=go.Layout(title='Relationship'))
py.iplot(fig)
py.iplot(fig2)
# Violin plot of insured zip codes split by gender.
trace = go.Violin(
    x = data['insured_sex'],
    y = data['insured_zip'],
    name = 'Gender vs Insured Zip',
    opacity = 0.75,
    marker = dict(
        # rgba() is required for a four-component (alpha) colour
        color = 'rgba(215, 5, 185, 0.5)'
    )
)
df = [trace]
layout = go.Layout(
    title = 'Gender vs Insured Zip')
fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
# Box plot of vehicle claim amount by automobile make.
trace = go.Box(
    x = data['auto_make'],
    y = data['vehicle_claim'],
    opacity = 0.7,
    marker = dict(
        # rgba() is required for a four-component (alpha) colour
        color = 'rgba(215, 195, 5, 0.5)'
    )
)
df = [trace]
layout = go.Layout(
    title = 'Automobile Company vs Vehicle Claim')
fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
# Histogram of annual policy premium.
trace = go.Histogram(
    x = data['policy_annual_premium'],
    marker = dict(
        # rgba() is required for a four-component (alpha) colour
        color = 'rgba(100, 75, 25, 0.5)'
    )
)
df = [trace]
# Axis titles belong directly on the 2-D layout — 'scene' only applies to 3-D
# plots, so the original titles were silently ignored. The x axis shows the
# premium, not age.
layout = go.Layout(
    title = 'Distribution of Annual Policy among the Customers',
    xaxis = dict(title = 'Annual Premium'),
    yaxis = dict(title = 'Count')
)
fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
# Histogram of customer ages.
trace = go.Histogram(
    x = data['age'],
    marker = dict(
        # rgba() is required for a four-component (alpha) colour
        color = 'rgba(215, 245, 5, 0.5)'
    )
)
df = [trace]
# Axis titles moved out of 'scene' — that key only applies to 3-D plots and was
# silently ignored here.
layout = go.Layout(
    title = 'Distribution of Age among the Customers',
    xaxis = dict(title = 'Age'),
    yaxis = dict(title = 'Count')
)
fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
# 3-D scatter: age vs property claim vs vehicle claim, coloured by age.
trace = go.Scatter3d(
    x = data['age'],
    y = data['property_claim'],
    z = data['vehicle_claim'],
    mode = 'markers',
    marker = dict(
        size = 10,
        color = data['age']
    )
)
df = [trace]
layout = go.Layout(
    # title fixed — 'Cholestrol vs Heart Rate vs Age' was a copy-paste
    # leftover from a heart-disease notebook
    title = 'Age vs Property Claim vs Vehicle Claim',
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    ),
    scene = dict(
        xaxis = dict(title = 'Age'),
        yaxis = dict(title = 'Property_claim'),
        zaxis = dict(title = 'Vehicle_claim')
    )
)
fig = go.Figure(data = df, layout=layout)
py.iplot(fig)
# parse the policy bind date (invalid dates become NaT rather than raising)
data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'], errors = 'coerce')
# let's encode the fraud report to numerical values
# NOTE(review): this maps 'Y' (fraud) -> 0 and 'N' (no fraud) -> 1, i.e. the
# POSITIVE fraud class becomes 0. Every target-encoded value and model metric
# below follows this inverted convention — confirm it is intentional before
# reading the per-category "fraud_reported" means as fraud rates.
data['fraud_reported'] = data['fraud_reported'].replace(('Y','N'),(0,1))
# checking the values of fraud reported
# data['fraud_reported'].value_counts()
# mean encoded target per auto_model (basis for the target encoding below)
data[['auto_model','fraud_reported']].groupby(['auto_model'],
as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| auto_model | fraud_reported | |
|---|---|---|
| 0 | 3 Series | 0.944444 |
| 31 | RSX | 0.916667 |
| 25 | Malibu | 0.900000 |
| 36 | Wrangler | 0.880952 |
| 29 | Pathfinder | 0.870968 |
| 35 | Ultima | 0.869565 |
| 9 | Camry | 0.857143 |
| 11 | Corolla | 0.850000 |
| 8 | CRV | 0.850000 |
| 21 | Legacy | 0.843750 |
| 27 | Neon | 0.837838 |
| 3 | 95 | 0.814815 |
| 33 | TL | 0.800000 |
| 2 | 93 | 0.800000 |
| 23 | MDX | 0.777778 |
| 6 | Accord | 0.769231 |
| 17 | Grand Cherokee | 0.760000 |
| 13 | Escape | 0.750000 |
| 12 | E400 | 0.740741 |
| 4 | A3 | 0.729730 |
| 18 | Highlander | 0.727273 |
| 28 | Passat | 0.727273 |
| 1 | 92x | 0.714286 |
| 20 | Jetta | 0.714286 |
| 16 | Fusion | 0.714286 |
| 15 | Forrestor | 0.714286 |
| 26 | Maxima | 0.708333 |
| 19 | Impreza | 0.700000 |
| 37 | X5 | 0.695652 |
| 30 | RAM | 0.674419 |
| 22 | M5 | 0.666667 |
| 5 | A5 | 0.656250 |
| 10 | Civic | 0.636364 |
| 14 | F150 | 0.629630 |
| 34 | Tahoe | 0.625000 |
| 7 | C300 | 0.611111 |
| 24 | ML350 | 0.600000 |
| 32 | Silverado | 0.590909 |
| 38 | X6 | 0.562500 |
# Target-encode auto_model with the per-model rates computed above.
# Fixes from the original: the replacement was applied to the 'auto_make'
# column (where none of these model names exist, so it was a silent no-op —
# the next output still showed string makes), and two model names were
# misspelt ('E4000' for 'E400', 'Tahaoe' for 'Tahoe').
data['auto_model'] = data['auto_model'].replace(('3 Series','RSX','Malibu','Wrangler','Pathfinder','Ultima','Camry',
            'Corolla','CRV','Legacy','Neon','95','TL','93','MDX','Accord','Grand Cherokee','Escape','E400',
            'A3','Highlander','Passat','92x','Jetta','Fusion','Forrestor','Maxima','Impreza','X5','RAM','M5','A5',
            'Civic','F150','Tahoe','C300','ML350','Silverado','X6'),
           (0.95,0.91, 0.90,0.88,0.87,0.86,0.855,0.85,0.85,0.84,0.83,0.81,0.80,0.80,0.78,0.77,0.76,0.75,0.74,
            0.73,0.72,0.72,0.71,0.71,0.71,0.71,0.70,0.70,0.69,0.67,0.66,0.65,0.64,0.63,0.62,0.61,0.60,0.59,0.56))
# let's check the values
# data['auto_model'].value_counts()
# mean encoded target per auto_make (basis for the next target encoding)
data[['auto_make','fraud_reported']].groupby(['auto_make'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| auto_make | fraud_reported | |
|---|---|---|
| 7 | Jeep | 0.835821 |
| 9 | Nissan | 0.820513 |
| 12 | Toyota | 0.814286 |
| 0 | Accura | 0.808824 |
| 10 | Saab | 0.775000 |
| 11 | Suburu | 0.762500 |
| 4 | Dodge | 0.750000 |
| 6 | Honda | 0.745455 |
| 3 | Chevrolet | 0.723684 |
| 2 | BMW | 0.722222 |
| 13 | Volkswagen | 0.720588 |
| 1 | Audi | 0.695652 |
| 5 | Ford | 0.694444 |
| 8 | Mercedes | 0.661538 |
# Target-encode auto_make using the fraud rates from the table above.
make_rates = {'Jeep': 0.84, 'Nissan': 0.82, 'Toyota': 0.81, 'Accura': 0.80,
              'Saab': 0.77, 'Suburu': 0.76, 'Dodge': 0.75, 'Honda': 0.74,
              'Chevrolet': 0.73, 'BMW': 0.72, 'Volkswagen': 0.71,
              'Audi': 0.69, 'Ford': 0.69, 'Mercedes': 0.66}
data['auto_make'] = data['auto_make'].replace(make_rates)
# sanity check: data['auto_make'].value_counts()
# mean encoded target per police_report_available category
data[['police_report_available','fraud_reported']].groupby(['police_report_available'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| police_report_available | fraud_reported | |
|---|---|---|
| 1 | YES | 0.770701 |
| 0 | NO | 0.744898 |
# Target-encode police_report_available.
# Fix: the original mapped ('NO','YES') -> (0.77, 0.74), the reverse of the
# rates in the table above (YES ~ 0.771, NO ~ 0.745).
data['police_report_available'] = data['police_report_available'].replace(('NO','YES'),(0.74,0.77))
# let's check the values
# data['police_report_available'].value_counts()
# mean encoded target per property_damage category
data[['property_damage','fraud_reported']].groupby(['property_damage'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| property_damage | fraud_reported | |
|---|---|---|
| 0 | NO | 0.757880 |
| 1 | YES | 0.741722 |
# Target-encode property_damage with the rates shown above.
damage_rates = {'NO': 0.76, 'YES': 0.74}
data['property_damage'] = data['property_damage'].replace(damage_rates)
# sanity check: data['property_damage'].value_counts()
# mean encoded target per incident city
data[['incident_city','fraud_reported']].groupby(['incident_city'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| incident_city | fraud_reported | |
|---|---|---|
| 4 | Northbrook | 0.778689 |
| 5 | Riverwood | 0.776119 |
| 3 | Northbend | 0.765517 |
| 6 | Springfield | 0.757962 |
| 2 | Hillsdale | 0.751773 |
| 1 | Columbus | 0.738255 |
| 0 | Arlington | 0.710526 |
# Target-encode incident_city with the rates shown above.
city_rates = {'Northbrook': 0.78, 'Riverwood': 0.77, 'Northbend': 0.76,
              'Springfield': 0.75, 'Hillsdale': 0.74, 'Columbus': 0.73,
              'Arlington': 0.71}
data['incident_city'] = data['incident_city'].replace(city_rates)
# sanity check: data['incident_city'].value_counts()
# mean encoded target per incident state
data[['incident_state','fraud_reported']].groupby(['incident_state'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| incident_state | fraud_reported | |
|---|---|---|
| 6 | WV | 0.820276 |
| 1 | NY | 0.778626 |
| 5 | VA | 0.772727 |
| 3 | PA | 0.733333 |
| 4 | SC | 0.705645 |
| 0 | NC | 0.690909 |
| 2 | OH | 0.565217 |
# Target-encode incident_state with the rates shown above.
state_rates = {'WV': 0.82, 'NY': 0.77, 'VA': 0.76, 'PA': 0.73,
               'SC': 0.70, 'NC': 0.69, 'OH': 0.56}
data['incident_state'] = data['incident_state'].replace(state_rates)
# sanity check: data['incident_state'].value_counts()
# mean encoded target per authorities_contacted category
data[['authorities_contacted','fraud_reported']].groupby(['authorities_contacted'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| authorities_contacted | fraud_reported | |
|---|---|---|
| 2 | None | 0.934066 |
| 4 | Police | 0.791096 |
| 1 | Fire | 0.730942 |
| 0 | Ambulance | 0.709184 |
| 3 | Other | 0.681818 |
# Target-encode authorities_contacted with the rates shown above
# (note: 'None' here is the literal string category, not Python None).
authority_rates = {'None': 0.94, 'Police': 0.79, 'Fire': 0.73,
                   'Ambulance': 0.70, 'Other': 0.68}
data['authorities_contacted'] = data['authorities_contacted'].replace(authority_rates)
# sanity check: data['authorities_contacted'].value_counts()
# mean encoded target per incident severity
data[['incident_severity','fraud_reported']].groupby(['incident_severity'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| incident_severity | fraud_reported | |
|---|---|---|
| 3 | Trivial Damage | 0.933333 |
| 1 | Minor Damage | 0.892655 |
| 2 | Total Loss | 0.871429 |
| 0 | Major Damage | 0.394928 |
# Target-encode incident_severity with the rates shown above.
severity_rates = {'Trivial Damage': 0.94, 'Minor Damage': 0.89,
                  'Total Loss': 0.87, 'Major Damage': 0.39}
data['incident_severity'] = data['incident_severity'].replace(severity_rates)
# sanity check: data['incident_severity'].value_counts()
# mean encoded target per collision type
data[['collision_type','fraud_reported']].groupby(['collision_type'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| collision_type | fraud_reported | |
|---|---|---|
| 1 | Rear Collision | 0.772340 |
| 2 | Side Collision | 0.746377 |
| 0 | Front Collision | 0.724409 |
# Target-encode collision_type with the rates shown above.
collision_rates = {'Rear Collision': 0.78, 'Side Collision': 0.74,
                   'Front Collision': 0.72}
data['collision_type'] = data['collision_type'].replace(collision_rates)
# sanity check: data['collision_type'].value_counts()
# mean encoded target per incident type
data[['incident_type','fraud_reported']].groupby(['incident_type'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| incident_type | fraud_reported | |
|---|---|---|
| 3 | Vehicle Theft | 0.914894 |
| 1 | Parked Car | 0.904762 |
| 0 | Multi-vehicle Collision | 0.727924 |
| 2 | Single Vehicle Collision | 0.709677 |
# let's perform target encoding for incident type
data['incident_type'] = data['incident_type'].replace(('Vehicle Theft','Parked Car','Multi-vehicle Collision',
'Single Vehicle Collision'),(0.91, 0.90, 0.72,0.70))
# let's check the values
#data['incident_type'].value_counts()
# parse the incident date (invalid dates become NaT rather than raising)
data['incident_date'] = pd.to_datetime(data['incident_date'], errors = 'coerce')
# extracting day and month from the incident date as new numeric features
data['incident_month'] = data['incident_date'].dt.month
data['incident_day'] = data['incident_date'].dt.day
# mean encoded target per insured_relationship category
data[['insured_relationship','fraud_reported']].groupby(['insured_relationship'],
as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| insured_relationship | fraud_reported | |
|---|---|---|
| 0 | husband | 0.794118 |
| 3 | own-child | 0.786885 |
| 4 | unmarried | 0.758865 |
| 1 | not-in-family | 0.741379 |
| 5 | wife | 0.729032 |
| 2 | other-relative | 0.706215 |
# Target-encode insured_relationship with the rates shown above.
relationship_rates = {'husband': 0.79, 'own-child': 0.78, 'unmarried': 0.75,
                      'not-in-family': 0.74, 'wife': 0.72, 'other-relative': 0.70}
data['insured_relationship'] = data['insured_relationship'].replace(relationship_rates)
# sanity check: data['insured_relationship'].value_counts()
# mean encoded target per insured hobby
data[['insured_hobbies','fraud_reported']].groupby(['insured_hobbies'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| insured_hobbies | fraud_reported | |
|---|---|---|
| 4 | camping | 0.909091 |
| 11 | kayaking | 0.907407 |
| 9 | golf | 0.890909 |
| 7 | dancing | 0.883721 |
| 3 | bungie-jumping | 0.839286 |
| 12 | movies | 0.836364 |
| 1 | basketball | 0.823529 |
| 8 | exercise | 0.807018 |
| 17 | sleeping | 0.804878 |
| 18 | video-games | 0.800000 |
| 16 | skydiving | 0.775510 |
| 13 | paintball | 0.771930 |
| 10 | hiking | 0.769231 |
| 0 | base-jumping | 0.734694 |
| 15 | reading | 0.734375 |
| 14 | polo | 0.723404 |
| 2 | board-games | 0.708333 |
| 19 | yachting | 0.698113 |
| 6 | cross-fit | 0.257143 |
| 5 | chess | 0.173913 |
# Target-encode insured_hobbies with the rates shown above (chess and
# cross-fit stand out with far lower encoded values).
hobby_rates = {
    'camping': 0.91, 'kayaking': 0.90, 'golf': 0.89, 'dancing': 0.88,
    'bungie-jumping': 0.84, 'movies': 0.83, 'basketball': 0.82,
    'exercise': 0.81, 'sleeping': 0.805, 'video-games': 0.80,
    'skydiving': 0.78, 'paintball': 0.77, 'hiking': 0.76,
    'base-jumping': 0.73, 'reading': 0.73, 'polo': 0.72,
    'board-games': 0.70, 'yachting': 0.69, 'cross-fit': 0.25, 'chess': 0.17,
}
data['insured_hobbies'] = data['insured_hobbies'].replace(hobby_rates)
# sanity check: data['insured_hobbies'].value_counts()
# mean encoded target per insured occupation
data[['insured_occupation','fraud_reported']].groupby(['insured_occupation'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| insured_occupation | fraud_reported | |
|---|---|---|
| 7 | other-service | 0.830986 |
| 8 | priv-house-serv | 0.830986 |
| 0 | adm-clerical | 0.830769 |
| 5 | handlers-cleaners | 0.796296 |
| 9 | prof-specialty | 0.788235 |
| 10 | protective-serv | 0.777778 |
| 6 | machine-op-inspct | 0.763441 |
| 1 | armed-forces | 0.753623 |
| 11 | sales | 0.723684 |
| 12 | tech-support | 0.717949 |
| 13 | transport-moving | 0.708333 |
| 2 | craft-repair | 0.702703 |
| 4 | farming-fishing | 0.698113 |
| 3 | exec-managerial | 0.631579 |
# Target-encode insured_occupation with the rates shown above.
occupation_rates = {
    'other-service': 0.84, 'priv-house-serv': 0.84, 'adm-clerical': 0.83,
    'handlers-cleaners': 0.79, 'prof-specialty': 0.78, 'protective-serv': 0.77,
    'machine-op-inspct': 0.76, 'armed-forces': 0.75, 'sales': 0.72,
    'tech-support': 0.71, 'transport-moving': 0.705, 'craft-repair': 0.70,
    'farming-fishing': 0.69, 'exec-managerial': 0.63,
}
data['insured_occupation'] = data['insured_occupation'].replace(occupation_rates)
# sanity check: data['insured_occupation'].value_counts()
# mean encoded target per education level
data[['insured_education_level','fraud_reported']].groupby(['insured_education_level'],
            as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
| insured_education_level | fraud_reported | |
|---|---|---|
| 5 | Masters | 0.776224 |
| 2 | High School | 0.775000 |
| 0 | Associate | 0.765517 |
| 3 | JD | 0.739130 |
| 1 | College | 0.737705 |
| 4 | MD | 0.736111 |
| 6 | PhD | 0.736000 |
# Target-encode insured_education_level with the rates shown above.
education_rates = {'Masters': 0.78, 'High School': 0.77, 'Associate': 0.76,
                   'JD': 0.74, 'College': 0.73, 'MD': 0.72, 'PhD': 0.71}
data['insured_education_level'] = data['insured_education_level'].replace(education_rates)
# sanity check: data['insured_education_level'].value_counts()
# mean encoded target per gender
data[['insured_sex','fraud_reported']].groupby(['insured_sex'], as_index = False).mean().sort_values(
            by = 'fraud_reported', ascending = False)
| insured_sex | fraud_reported | |
|---|---|---|
| 0 | FEMALE | 0.765363 |
| 1 | MALE | 0.738661 |
# target encoding for sex (rates above: FEMALE ~ 0.765, MALE ~ 0.739)
data['insured_sex'] = data['insured_sex'].replace(('FEMALE','MALE'),(0.76,0.73))
#data['insured_sex'].value_counts()
# csl - combined single limit
'''CSL is a single number that describes the predetermined limit for the combined total of the Bodily Injury
Liability coverage and Property Damage Liability coverage per occurrence or accident.'''
# mean encoded target per policy_csl category
data[['policy_csl','fraud_reported']].groupby(['policy_csl'], as_index = False).mean().sort_values(
by = 'fraud_reported', ascending = False)
| policy_csl | fraud_reported | |
|---|---|---|
| 2 | 500/1000 | 0.783333 |
| 0 | 100/300 | 0.742120 |
| 1 | 250/500 | 0.737892 |
# Target-encode policy_csl with the rates shown above.
csl_rates = {'500/1000': 0.78, '100/300': 0.74, '250/500': 0.73}
data['policy_csl'] = data['policy_csl'].replace(csl_rates)
# sanity check: data['policy_csl'].value_counts()
# mean encoded target per policy state
data[['policy_state','fraud_reported']].groupby(['policy_state'], as_index = False).mean().sort_values(
            by = 'fraud_reported', ascending = False)
| policy_state | fraud_reported | |
|---|---|---|
| 0 | IL | 0.772189 |
| 1 | IN | 0.745161 |
| 2 | OH | 0.741477 |
# target encoding for policy_state (original comment wrongly said policy_csl)
data['policy_state'] = data['policy_state'].replace(('IL','IN','OH'),(0.77,0.745,0.74))
# check the values
# data['policy_state'].value_counts()
# drop identifier, raw-date and free-text columns that carry no model signal
data = data.drop(['policy_number','policy_bind_date', 'incident_date','incident_location','auto_model'], axis = 1)
# let's check the columns after deleting the columns
data.columns
Index(['months_as_customer', 'age', 'policy_state', 'policy_csl',
'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
'insured_zip', 'insured_sex', 'insured_education_level',
'insured_occupation', 'insured_hobbies', 'insured_relationship',
'capital-gains', 'capital-loss', 'incident_type', 'collision_type',
'incident_severity', 'authorities_contacted', 'incident_state',
'incident_city', 'incident_hour_of_the_day',
'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
'witnesses', 'police_report_available', 'total_claim_amount',
'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
'auto_year', 'fraud_reported', 'incident_month', 'incident_day'],
dtype='object')
# Separate the feature matrix (x) from the target vector (y).
y = data['fraud_reported']
x = data.drop(columns=['fraud_reported'])
print("Shape of x :", x.shape)
print("Shape of y :", y.shape)
Shape of x : (1000, 35) Shape of y : (1000,)
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)
print("Shape of x_train :", x_train.shape)
print("Shape of x_test :", x_test.shape)
print("Shape of y_train :", y_train.shape)
print("Shape of y_test :", y_test.shape)
Shape of x_train : (800, 35) Shape of x_test : (200, 35) Shape of y_train : (800,) Shape of y_test : (200,)
# Correlation heat map of the (now fully numeric) training features.
plt.rcParams['figure.figsize'] = (15, 10)
sns.heatmap(x_train.corr(), cmap = 'copper')
plt.title('Heat Map for Correlations', fontsize = 20)
plt.show()
Random Forest Classifier
# Balanced Random Forest Classifier (resamples each bootstrap to handle the
# class imbalance).
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

model = BalancedRandomForestClassifier(n_estimators = 100, random_state = 0)
model.fit(x_train, y_train)
y_pred_rf = model.predict(x_test)
print("Training Accuracy: ", model.score(x_train, y_train))
# message typo fixed ('Accuarcy' -> 'Accuracy')
print('Testing Accuracy: ', model.score(x_test, y_test))
# making a classification report
cr = classification_report(y_test, y_pred_rf)
print(cr)
# making a confusion matrix
plt.rcParams['figure.figsize'] = (5, 5)
cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm, annot = True, cmap = 'spring')
plt.show()
Training Accuracy: 0.91375
Testing Accuarcy: 0.86
precision recall f1-score support
0 0.65 0.87 0.74 46
1 0.96 0.86 0.90 154
accuracy 0.86 200
macro avg 0.80 0.86 0.82 200
weighted avg 0.88 0.86 0.87 200
Easy Ensemble Classifier
# Easy Ensemble Classifier (bags of AdaBoost learners trained on balanced
# bootstrap samples).
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

model1 = EasyEnsembleClassifier(n_estimators = 100, random_state = 0)
model1.fit(x_train, y_train)
y_pred_ef = model1.predict(x_test)
print("Training Accuracy: ", model1.score(x_train, y_train))
# message typo fixed ('Accuarcy' -> 'Accuracy')
print('Testing Accuracy: ', model1.score(x_test, y_test))
# making a classification report
cr = classification_report(y_test, y_pred_ef)
print(cr)
# making a confusion matrix
cm = confusion_matrix(y_test, y_pred_ef)
sns.heatmap(cm, annot = True, cmap = 'copper')
plt.show()
Training Accuracy: 0.8425
Testing Accuarcy: 0.825
precision recall f1-score support
0 0.58 0.85 0.69 46
1 0.95 0.82 0.88 154
accuracy 0.82 200
macro avg 0.76 0.83 0.78 200
weighted avg 0.86 0.82 0.83 200
Bagging Classifier
# Random Forest inside a Balanced Bagging Classifier (each bag is resampled
# to balance the classes before fitting the forest).
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): 'base_estimator' was renamed to 'estimator' in
# imbalanced-learn 0.10 and removed in 0.12 — switch the keyword on upgrade.
model2 = BalancedBaggingClassifier(base_estimator = RandomForestClassifier(),
                                   sampling_strategy = 'auto',
                                   replacement = False,
                                   random_state = 0)
model2.fit(x_train, y_train)
y_pred_bc = model2.predict(x_test)
print("Training Accuracy: ", model2.score(x_train, y_train))
# message typo fixed ('Accuarcy' -> 'Accuracy')
print('Testing Accuracy: ', model2.score(x_test, y_test))
# making a classification report
cr = classification_report(y_test, y_pred_bc)
print(cr)
# making a confusion matrix
cm = confusion_matrix(y_test, y_pred_bc)
sns.heatmap(cm, annot = True, cmap = 'Purples')
plt.show()
Training Accuracy: 0.92625
Testing Accuracy: 0.855
precision recall f1-score support
0 0.65 0.80 0.72 46
1 0.94 0.87 0.90 154
accuracy 0.85 200
macro avg 0.79 0.84 0.81 200
weighted avg 0.87 0.85 0.86 200
Blending (Weighted Averaging) the Predictions of the above Models
# Weighted average ("blend") of the three models' hard predictions.
# NOTE(review): despite the original "boosting" label, this is not boosting --
# it is a manual soft-vote with hand-picked weights (RF 0.5, EEC 0.2, BBC 0.3).
y_pred = y_pred_rf*0.5 + y_pred_ef*0.2 + y_pred_bc*0.3
# threshold the blended score at 0.5 (in place) to recover hard 0/1 labels
y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0
# making a classification report
cr = classification_report(y_test, y_pred)
print(cr)
# making a confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot = True, cmap = 'Reds')
plt.show()
precision recall f1-score support
0 0.65 0.87 0.74 46
1 0.96 0.86 0.90 154
accuracy 0.86 200
macro avg 0.80 0.86 0.82 200
weighted avg 0.88 0.86 0.87 200
Voting Classifier
# Soft-voting ensemble: averages the predicted class probabilities of the
# three imbalance-aware classifiers used above.
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(
    estimators=[
        ('brf', BalancedRandomForestClassifier()),
        ('bc', BalancedBaggingClassifier()),
        ('eec', EasyEnsembleClassifier()),
    ],
    voting='soft',
)
voting.fit(x_train, y_train)
y_pred = voting.predict(x_test).astype(int)

# classification report
cr = classification_report(y_test, y_pred)
print(cr)

# confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, cmap='magma')
plt.show()
precision recall f1-score support
0 0.67 0.89 0.77 46
1 0.96 0.87 0.91 154
accuracy 0.88 200
macro avg 0.82 0.88 0.84 200
weighted avg 0.90 0.88 0.88 200
Under Sampling
y.value_counts()
1 753 0 247 Name: fraud_reported, dtype: int64
# Collect the index labels of every fraud row (fraud_reported == 0)
# and count them.
frauds = data.index[data['fraud_reported'] == 0].to_numpy()
no_frauds = len(frauds)
print(no_frauds)
247
# Index labels of the majority-class (non-fraud, fraud_reported == 1) rows.
# fix: the original stored the full rows here (a DataFrame) even though the
# name says "indices"; the later sampling step then drew from a bare count
# rather than from the actual non-fraud row labels. Store the labels.
normal_indices = np.array(data[data['fraud_reported'] == 1].index)
no_normal_indices = len(normal_indices)
print(no_normal_indices)
753
# Randomly pick as many non-fraud row labels as there are fraud rows.
# fix: the original called np.random.choice(no_normal_indices, ...), which
# samples integers 0..752 -- positions, not the labels of the non-fraud
# rows -- and used replace=True, which duplicates rows in the undersample.
# Sample the actual non-fraud index labels, without replacement.
random_normal_indices = np.random.choice(
    data.index[data['fraud_reported'] == 1].to_numpy(),
    size = no_frauds, replace = False)
random_normal_indices = np.array(random_normal_indices)
print(len(random_normal_indices))
247
# Merge the fraud labels with the sampled non-fraud labels into one
# balanced index set (247 + 247 = 494 rows expected).
under_sample = np.concatenate((frauds, random_normal_indices))
print(under_sample.shape[0])
494
# creating the undersample data
# fix: `frauds` / `random_normal_indices` hold index *labels*, so select by
# label with .loc rather than by position with .iloc (identical on a default
# RangeIndex, but correct in general).
undersample_data = data.loc[under_sample, :]

# splitting the undersample dataset into x (features) and y (target) sets
x_u = undersample_data.loc[:, undersample_data.columns != 'fraud_reported']
y_u = undersample_data.loc[:, undersample_data.columns == 'fraud_reported']
print(x_u.shape)
print(y_u.shape)
(494, 35) (494, 1)
from sklearn.model_selection import train_test_split

# 80/20 split of the undersampled data
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_u, y_u, test_size = 0.2, random_state = 0)
print(x_train1.shape)
print(y_train1.shape)
print(x_test1.shape)
# fix: also report the test-label shape (the original omitted it, unlike
# the analogous SMOTE split cell later in the notebook)
print(y_test1.shape)
(395, 35) (395, 1) (99, 35)
# standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# fit the scaler on the training fold only, then apply it to the test fold
x_train1 = sc.fit_transform(x_train1)
x_test1 = sc.transform(x_test1)

from sklearn.ensemble import RandomForestClassifier
model_u = RandomForestClassifier()
# fix: y_train1 is a one-column DataFrame; ravel it to the 1-D array sklearn
# expects (avoids the column-vector DataConversionWarning)
model_u.fit(x_train1, y_train1.values.ravel())
y_pred = model_u.predict(x_test1)

# fix: corrected "Accuarcy" typo in the printed label
print("Training Accuracy: ", model_u.score(x_train1, y_train1))
print('Testing Accuracy: ', model_u.score(x_test1, y_test1))

# confusion matrix
cm = confusion_matrix(y_test1, y_pred)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, cmap = 'winter')
plt.show()

# classification report
cr = classification_report(y_test1, y_pred)
print(cr)
Training Accuracy: 1.0 Testing Accuracy: 0.8484848484848485
precision recall f1-score support
0 0.78 0.35 0.48 20
1 0.86 0.97 0.91 79
accuracy 0.85 99
macro avg 0.82 0.66 0.70 99
weighted avg 0.84 0.85 0.82 99
Over Sampling with SMOTE
from imblearn.over_sampling import SMOTE
x_resample, y_resample = SMOTE().fit_sample(x, y.values.ravel())
print(x_resample.shape)
print(y_resample.shape)
----------------------------------------------------------- AttributeError Traceback (most recent call last) Input In [101], in <cell line: 3>() 1 from imblearn.over_sampling import SMOTE ----> 3 x_resample, y_resample = SMOTE().fit_sample(x, y.values.ravel()) 5 print(x_resample.shape) 6 print(y_resample.shape) AttributeError: 'SMOTE' object has no attribute 'fit_sample'
# Hold out 20% of the SMOTE-balanced data for testing.
from sklearn.model_selection import train_test_split

x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x_resample, y_resample, test_size = 0.2, random_state = 0)

for part in (x_train2, y_train2, x_test2, y_test2):
    print(part.shape)
# standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train2 = sc.fit_transform(x_train2)
x_test2 = sc.transform(x_test2)

# Random Forest Classifier on the SMOTE-balanced data
# NOTE(review): SMOTE was applied before the train/test split, so synthetic
# test rows are interpolated from training rows -- test scores will be
# optimistic. Prefer oversampling inside the training fold only.
from sklearn.ensemble import RandomForestClassifier
model_o = RandomForestClassifier()
model_o.fit(x_train2, y_train2)
y_pred = model_o.predict(x_test2)

# fix: corrected "Accuarcy" typo in the printed label
print("Training Accuracy: ", model_o.score(x_train2, y_train2))
print('Testing Accuracy: ', model_o.score(x_test2, y_test2))

# confusion matrix
cm = confusion_matrix(y_test2, y_pred)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, cmap = 'winter')
plt.show()

# classification report
cr = classification_report(y_test2, y_pred)
print(cr)
# let's check the importance of each attribute via permutation importance
# fix: the original called eli5.show_weights without importing the top-level
# eli5 package (only PermutationImportance was imported), raising NameError.
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(model, random_state = 0).fit(x_test, y_test)
eli5.show_weights(perm, feature_names = x_test.columns.tolist())
#!pip install pdpbox --user
# Partial-dependence of the model's prediction on `incident_severity`.
from pdpbox import pdp, info_plots  # for partial plots

feature = 'incident_severity'
pdp_dist = pdp.pdp_isolate(
    model=model,
    dataset=x_test,
    model_features=x_train.columns.values.tolist(),
    feature=feature,
)
pdp.pdp_plot(pdp_dist, feature)
plt.show()
# Partial-dependence of the model's prediction on `collision_type`.
from pdpbox import pdp, info_plots  # for partial plots

feature = 'collision_type'
pdp_dist = pdp.pdp_isolate(
    model=model,
    dataset=x_test,
    model_features=x_train.columns.values.tolist(),
    feature=feature,
)
pdp.pdp_plot(pdp_dist, feature)
plt.show()
# Partial-dependence on `incident_severity`.
# NOTE(review): this repeats the incident_severity plot generated a few
# cells earlier -- likely a copy/paste leftover.
from pdpbox import pdp, info_plots  # for partial plots

feature = 'incident_severity'
pdp_dist = pdp.pdp_isolate(
    model=model,
    dataset=x_test,
    model_features=x_train.columns.values.tolist(),
    feature=feature,
)
pdp.pdp_plot(pdp_dist, feature)
plt.show()
# Partial-dependence of the model's prediction on `insured_zip`.
from pdpbox import pdp, info_plots  # for partial plots

feature = 'insured_zip'
pdp_dist = pdp.pdp_isolate(
    model=model,
    dataset=x_test,
    model_features=x_train.columns.values.tolist(),
    feature=feature,
)
pdp.pdp_plot(pdp_dist, feature)
plt.show()
# Partial-dependence of the model's prediction on `age`.
from pdpbox import pdp, info_plots  # for partial plots

feature = 'age'
pdp_dist = pdp.pdp_isolate(
    model=model,
    dataset=x_test,
    model_features=x_train.columns.values.tolist(),
    feature=feature,
)
pdp.pdp_plot(pdp_dist, feature)
plt.show()
# let's see the shap values
# `explainer` is reused by the force-plot cell at the end of the notebook.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_test)
# index [1] selects the SHAP values for class index 1
# (presumably fraud_reported = 1 -- TODO confirm class ordering)
shap.summary_plot(shap_values[1], x_test, plot_type="bar")
shap.summary_plot(shap_values[1], x_test)
# helper to explain a single claim's prediction with SHAP
# (the original comment said "patient's conditions" -- copied from a medical
# notebook; this model scores insurance-fraud claims)
def fraud_analysis(model, fraud):
    """Render a SHAP force plot explaining `model`'s prediction for one claim.

    model : a fitted tree-based classifier (accepted by shap.TreeExplainer)
    fraud : a single row of feature values (e.g. one row of x_test)

    Returns the force-plot visualisation built from the class-1 SHAP values.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(fraud)
    # initjs loads the JS needed to render force plots in a notebook
    shap.initjs()
    return shap.force_plot(explainer.expected_value[1], shap_values[1], fraud)
# let's do some real-time prediction explanations for individual claims
# (one cell per row so each force plot renders in the notebook;
# the original comment said "patients" -- wrong domain, these are claims)
fraud = x_test.iloc[1,:].astype(float)
fraud_analysis(model, fraud)
fraud = x_test.iloc[2,:].astype(float)
fraud_analysis(model, fraud)
fraud = x_test.iloc[3,:].astype(float)
fraud_analysis(model, fraud)
fraud = x_test.iloc[4,:].astype(float)
fraud_analysis(model, fraud)
fraud = x_test.iloc[5,:].astype(float)
fraud_analysis(model, fraud)
# Force plot over the first 50 test rows.
# fix: the original computed shap_values on x_train.iloc[:50] but plotted
# them against x_test.iloc[:50], so the displayed feature values did not
# correspond to the SHAP values. Use the same 50 test rows for both.
shap_values = explainer.shap_values(x_test.iloc[:50])
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], x_test.iloc[:50])